A random n-class classification dataset can be generated using sklearn.datasets.make_classification. Here, we generate a dataset with two features and 2000 instances (1000 × (number of classes − 1)). Moreover, the dataset is generated for multiclass classification with three classes.
import numpy as np
import pandas as pd
from sklearn.datasets import make_classification
from num2words import num2words
# --- Synthetic multiclass dataset -------------------------------------
# Two informative features, no redundant ones, one cluster per class.
# The sample count scales with the number of classes: (n_classes - 1) * 1000.
n_features = 2
n_classes = 3
X, y = make_classification(
    n_samples=int((n_classes - 1) * 1e3),
    n_features=n_features,
    n_informative=2,
    n_redundant=0,
    n_classes=n_classes,
    n_clusters_per_class=1,
    random_state=1,
)

# Map each integer class label to its title-cased English word (0 -> 'Zero', ...).
Labels_dict = {label: num2words(label).title() for label in np.unique(y)}

# Tabular view of the data: one column per feature plus the target column.
Target = 'Outcome Variable'
Data = pd.DataFrame(X, columns=['Feature %i' % k for k in range(1, n_features + 1)])
Data[Target] = y
display(Data)
from HD_DeepLearning import Plot_Data

# Display options handed to the project helper Plot_Data.
# NOTE(review): the key 'cricle_size' is kept exactly as spelled in the
# original call; presumably the helper reads it under this name. Confirm
# against HD_DeepLearning before "fixing" the spelling.
PD = {
    'BP': .5,
    'alpha': .7,
    'bg_alpha': 0.25,
    'grid': True,
    'cricle_size': 50,
    'FigSize': 7,
    'h': 0.02,
    'pad': 1,
    'ColorMap': 'Set1',
    'Labels': list(Labels_dict.values()),
}
Plot_Data(X, y, PD=PD, Labels_dict=Labels_dict, ax=None)
| | Feature 1 | Feature 2 | Outcome Variable |
|---|---|---|---|
| 0 | 0.421823 | -1.258802 | 2 |
| 1 | 1.174360 | 1.586866 | 0 |
| 2 | -0.444844 | 0.623748 | 2 |
| 3 | 1.286082 | 1.791197 | 0 |
| 4 | 1.050679 | 1.105048 | 0 |
| ... | ... | ... | ... |
| 1995 | -1.929029 | 0.119340 | 2 |
| 1996 | -0.010248 | -0.785788 | 1 |
| 1997 | 1.796874 | 3.145459 | 0 |
| 1998 | 1.656980 | 2.623708 | 0 |
| 1999 | 0.820434 | 0.450676 | 0 |
2000 rows × 3 columns
# One pull value per class: 0.01 for every class except the last, which gets 0.1.
Pull = [.01] * (len(Labels_dict) - 1) + [.1]

import plotly.express as px
from HD_DeepLearning import DatasetTargetDist

# Layout options forwarded to the project helper DatasetTargetDist.
PD = {
    'PieColors': px.colors.sequential.Plasma_r,
    'TableColors': ['Navy', 'White'],
    'hole': .4,
    'column_widths': [0.6, 0.4],
    'textfont': 14,
    'height': 400,
    'tablecolumnwidth': [0.25, 0.15, 0.15],
    'pull': Pull,
    'legend_title': Target,
    'title_x': 0.5,
    'title_y': .9,
    'pie_legend': [0.01, 0.01],
}
# The list itself stays alive through PD['pull']; only the temporary name is dropped.
del Pull
DatasetTargetDist(Data, Target, Labels_dict, PD, orientation='columns')
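StratifiedShuffleSplit is a cross-validator that combines StratifiedKFold and ShuffleSplit and returns stratified randomized splits: each set contains approximately the same percentage of samples of each target class as the complete set. Here it is used with a single split to hold out 30% of the data as the test set.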
from sklearn.model_selection import StratifiedShuffleSplit


def _take_rows(data, positions):
    """Select rows of *data* by integer position.

    The indices produced by the splitter are positional, so pandas objects
    must be indexed with ``.iloc``; label-based access (``.loc`` or plain
    ``[]`` on a Series) only works by accident when the index happens to be
    0..n-1.
    """
    if isinstance(data, (pd.DataFrame, pd.Series)):
        return data.iloc[positions]
    return data[positions]


# Single stratified shuffle split: 30% of the samples are held out for testing.
Test_Size = 0.3
sss = StratifiedShuffleSplit(n_splits=1, test_size=Test_Size, random_state=42)

# n_splits is 1, so the splitter yields exactly one (train, test) index pair.
train_index, test_index = next(sss.split(X, y))

X_train, X_test = _take_rows(X, train_index), _take_rows(X, test_index)
y_train, y_test = _take_rows(y, train_index), _take_rows(y, test_index)
del sss
from HD_DeepLearning import Train_Test_Dist

# Reuse the existing plot-options dict, overriding only the layout entries
# passed to the train/test distribution figure.
PD.update(
    column_widths=[0.3, 0.3, 0.3],
    tablecolumnwidth=[0.2, 0.4],
    height=550,
    legend_title=Target,
)
Train_Test_Dist(X_train, y_train, X_test, y_test, PD, Labels_dict)
Multinomial logistic regression is a classification method that generalizes logistic regression to multiclass problems.
import torch
def TorchSets(Set):
    """Convert an array-like into a torch tensor, placed on the GPU if one is available.

    Parameters
    ----------
    Set : numpy.ndarray, pandas.DataFrame, pandas.Series or sequence
        Input data. pandas objects are reduced to their underlying values;
        anything else is passed through ``np.asarray``.

    Returns
    -------
    torch.Tensor
        One-dimensional inputs are cast to 64-bit integers (``torch.long``);
        inputs of any other rank keep the dtype of the NumPy array. The
        tensor is moved to CUDA when ``torch.cuda.is_available()`` is true.
    """
    if isinstance(Set, (pd.DataFrame, pd.Series)):
        Set = Set.values
    # Accept plain sequences as well; an ndarray passes through unchanged.
    Set = np.asarray(Set)

    # torch.autograd.Variable is a deprecated no-op wrapper, so the tensor
    # is returned directly.
    Out = torch.from_numpy(Set)
    if Set.ndim == 1:
        Out = Out.long()
    if torch.cuda.is_available():
        Out = Out.cuda()
    return Out
# Tensors
X_train_tensor = TorchSets(X_train)
y_train_tensor = TorchSets(y_train)
X_test_tensor = TorchSets(X_test)
y_test_tensor = TorchSets(y_test)

# Training schedule: the number of epochs is chosen so that the total number
# of mini-batch updates is roughly `iteration_number`
# (len(X_train) / Batch_size is the number of batches per epoch).
Batch_size = 100
iteration_number = int(3e2)
epochs_number = int(iteration_number / (len(X_train) / Batch_size))

# Pytorch train and test sets
Train_set = torch.utils.data.TensorDataset(X_train_tensor, y_train_tensor)
Test_set = torch.utils.data.TensorDataset(X_test_tensor, y_test_tensor)

# Data loaders. The test loader must wrap Test_set: it was previously built
# from Train_set, so every "test" metric was computed on the training data.
train_loader = torch.utils.data.DataLoader(Train_set, batch_size=Batch_size, shuffle=False)
test_loader = torch.utils.data.DataLoader(Test_set, batch_size=Batch_size, shuffle=False)
class LogisticRegressionModel(torch.nn.Module):
    """Multinomial logistic regression expressed as a single linear layer.

    ``forward`` returns the raw output of the linear layer; no softmax is
    applied inside the model.
    """

    def __init__(self, input_Size, output_Size):
        """Create the layer mapping ``input_Size`` features to ``output_Size`` scores."""
        super().__init__()
        self.linear = torch.nn.Linear(in_features=input_Size, out_features=output_Size)

    def forward(self, x):
        """Return the linear layer's output for the batch ``x``."""
        return self.linear(x)
Fitting the model
# Model dimensions: one input per feature, one output score per class.
input_Size, output_Size = n_features, len(Labels_dict)
# NOTE(review): hidden_Size is not used by the single-layer model in the
# code visible here; kept in case later cells reference it.
hidden_Size = 256

# model
model = LogisticRegressionModel(input_Size, output_Size)
# GPU
if torch.cuda.is_available():
    model.cuda()

# Cross Entropy Loss
criterion = torch.nn.CrossEntropyLoss()
# Optimizer
optimizer = torch.optim.SGD(model.parameters(), lr=1e-2, momentum=.9)

# Training the Model: bookkeeping containers
Count = 0            # number of optimizer steps taken so far
Loss_list = []
Iteration_list = []
Accuracy_list = []
# NOTE(review): MSE_list and MAE_list are not filled in the visible code.
MSE_list = []
MAE_list = []
Steps = 10           # evaluate every `Steps` optimizer steps

import progressbar

# The bar's maximum is the real number of optimizer steps (epochs × batches
# per epoch) instead of an arbitrary `iteration_number + 200` pad, so the
# displayed percentage tracks actual progress and the counter can never
# exceed the maximum. max(1, ...) guards the degenerate zero-epoch case.
Progress_Bar = progressbar.ProgressBar(
    maxval=max(1, epochs_number * len(train_loader)),
    widgets=[progressbar.Bar('=', '|', '|'), progressbar.Percentage()],
)
# Mini-batch SGD training. Every `Steps` optimizer steps the model is scored
# on `test_loader` and the current loss/accuracy are recorded.
for epoch in range(epochs_number):
    for Xtr, ytr in train_loader:
        # Flatten each sample to n_features values; the model expects float input.
        Xtr = Xtr.view(-1, n_features).float()

        # Set all gradients to zero
        optimizer.zero_grad()
        # Forward
        Out = model(Xtr)
        # Loss (CrossEntropyLoss needs integer class targets)
        loss = criterion(Out, ytr.long())
        # Backward (calculating the gradients)
        loss.backward()
        # Update parameters
        optimizer.step()
        Count += 1

        if Count % Steps == 0:
            Correct, Total = 0, 0
            # Evaluation only: no gradients are needed, so skip graph building.
            with torch.no_grad():
                for Xts, yts in test_loader:
                    Out = model(Xts.view(-1, n_features).float())
                    # Index of the largest score in each row = predicted class
                    Predicted = torch.max(Out, 1)[1]
                    Total += len(yts)
                    Correct += (Predicted == yts).sum().item()

            # The recorded loss is that of the most recent training batch.
            # Plain Python floats are stored, so no per-element tensor
            # conversion is needed when building `history`.
            Loss_list.append(loss.item())
            Iteration_list.append(Count)
            Accuracy_list.append(Correct / float(Total))

        Progress_Bar.update(Count)
Progress_Bar.finish()

history = pd.DataFrame({'Iteration': np.array(Iteration_list),
                        'Loss': np.array(Loss_list),
                        'Accuracy': np.array(Accuracy_list)})
del Loss_list, Iteration_list, Accuracy_list
|=========================================================================|100%
Model Performance
from HD_DeepLearning import Plot_history

# Figure/table options forwarded to the project helper Plot_history.
# Keys are kept exactly as the original call spelled them.
PD = {
    'row_heights': [0.4, 0.6],
    'lw': 1.5,
    'font_size': 12,
    'height': 700,
    'yLim': 1.5,
    'th_line_color': 'Navy',
    'th_fill_color': 'darkslategray',
    'table_columnwidth': [0.4, 0.4, 0.4, 0.4],
    'tc_line_color': 'Navy',
    'tc_fill_color': None,
    'title_x': 0.46,
    'title_y': 0.92,
    'tb_cell_heigh': 20,
    'Number_Format': '%.4e',
}
Plot_history(history, PD, Title='Test Set', Colors=['DarkGreen', 'Red'])
from HD_DeepLearning import Plot_Classification_Torch
import matplotlib.pyplot as plt

# Display options forwarded to the project helper Plot_Classification_Torch.
PD = {
    'BP': .5,
    'alpha': .7,
    'bg_alpha': 0.15,
    'grid': False,
    'cricle_size': 50,
    'FigSize': 7,
    'h': 0.02,
    'pad': 1,
    'ColorMap': 'Set1',
    'Labels': list(Labels_dict.values()),
}

# Two side-by-side panels: training data on the left, test data on the right.
fig, ax = plt.subplots(1, 2, figsize=(16, 7))
panels = (
    (ax[0], X_train, y_train, 'Train Set'),
    (ax[1], X_test, y_test, 'Test Set'),
)
for axis, features, labels, title in panels:
    Plot_Classification_Torch(model, features, labels, PD=PD, ax=axis)
    _ = axis.set_title(title, fontsize=16, weight='bold')
The confusion matrix allows for visualization of the performance of an algorithm. Note that, due to the size of the data, we do not provide a cross-validation evaluation here. In general, this type of evaluation is preferred.
from sklearn import metrics


def _predicted_classes(features_tensor):
    """Return, as a NumPy array, the index of the largest model score per row."""
    scores = model(features_tensor.float())
    winners = torch.max(scores.data, 1)[1]
    return winners.cpu().data.numpy()


def _report_frame(y_true, y_hat):
    """Build the classification report as a DataFrame with one row per class/average."""
    report = metrics.classification_report(
        y_true, y_hat, target_names=list(Labels_dict.values()), output_dict=True
    )
    return pd.DataFrame(report).T


def _styled_report(report, label_column, body_background, label_background):
    """Hide the index and colour the table body and its label column."""
    styler = report.style.hide(axis='index')
    styler = styler.set_properties(**{'background-color': body_background, 'color': 'Black'})
    return styler.set_properties(
        subset=[label_column], **{'background-color': label_background, 'color': 'White'}
    )


# Train
y_pred = _predicted_classes(X_train_tensor)
Reports_Train = _report_frame(y_train, y_pred)
CM_Train = metrics.confusion_matrix(y_train, y_pred)

# Test (y_pred is left holding the test-set predictions, as before)
y_pred = _predicted_classes(X_test_tensor)
Reports_Test = _report_frame(y_test, y_pred)
CM_Test = metrics.confusion_matrix(y_test, y_pred)

# Turn the row labels into a regular first column named after the split.
Reports_Train = Reports_Train.reset_index().rename(columns={'index': 'Train Set'})
Reports_Test = Reports_Test.reset_index().rename(columns={'index': 'Test Set'})

display(_styled_report(Reports_Train, 'Train Set', 'HoneyDew', 'SeaGreen'))
display(_styled_report(Reports_Test, 'Test Set', 'Azure', 'RoyalBlue'))

from HD_DeepLearning import Confusion_Mat

# Options forwarded to the project helper Confusion_Mat.
PD = {
    'FS': (14, 6),
    'annot_kws': 14,
    'shrink': .6,
    'Labels': list(Labels_dict.values()),
}
Confusion_Mat(CM_Train, CM_Test, PD=PD, n_splits=None)
| Train Set | precision | recall | f1-score | support |
|---|---|---|---|---|
| Zero | 0.917004 | 0.970021 | 0.942768 | 467.000000 |
| One | 0.838926 | 0.797872 | 0.817884 | 470.000000 |
| Two | 0.771242 | 0.764579 | 0.767896 | 463.000000 |
| accuracy | 0.844286 | 0.844286 | 0.844286 | 0.844286 |
| macro avg | 0.842391 | 0.844158 | 0.842849 | 1400.000000 |
| weighted avg | 0.842587 | 0.844286 | 0.843010 | 1400.000000 |
| Test Set | precision | recall | f1-score | support |
|---|---|---|---|---|
| Zero | 0.918269 | 0.955000 | 0.936275 | 200.000000 |
| One | 0.854167 | 0.815920 | 0.834606 | 201.000000 |
| Two | 0.775000 | 0.778894 | 0.776942 | 199.000000 |
| accuracy | 0.850000 | 0.850000 | 0.850000 | 0.850000 |
| macro avg | 0.849145 | 0.849938 | 0.849274 | 600.000000 |
| weighted avg | 0.849277 | 0.850000 | 0.849370 | 600.000000 |